#########
## Nature Human Behaviour 
## Leading Countries in Global Science Increasingly Receive More Citations than Other Countries Despite Doing Similar Research.
## https://doi.org/10.1038/s41562-022-01351-5
## Harvard Dataverse (Code and Metadata): https://doi.org/10.7910/DVN/WCOINR 
## Step 0G
## Data: Data_20210905
#########

# At a terminal, run: 
# Note: '$1' is the discipline ID that you pass in.
# Note: the second parameter is either 'english_only' or 'all',
# where 'english_only' gives the main results and 'all' gives the SI results.
# ml python/3.6.1
# ml py-ipython/6.1.0_py36 python/3.6.1 py-scipy/1.1.0_py36 py-scikit-learn/0.19.1_py36 py-pandas/0.23.0_py36 gcc/10.1.0 py-pytorch/1.4.0_py36
# ml py-numpy/1.17.2_py36
# export PYTHONPATH=$GROUP_HOME/python/lib/python3.6/site-packages:$PYTHONPATH
# srun python3 -u Step_X0G_Python3_RR_MAG_Journal_Censored_NLLDA_Topic_Coherence_Scores.py "$1" "english_only"


#############################
### Input Discipline
#############################
# E.g., discipline = '147176958'
import sys
# Both positional arguments are required: argv[1] is the discipline ID and
# argv[2] selects the corpus variant ('english_only' or 'all').
discipline, language = str(sys.argv[1]), str(sys.argv[2])

#############################
### Time Start
#############################
import time 
# Wall-clock timestamp (seconds since the epoch) captured when the script starts.
start_time = time.time()

###################################
##### Modules
###################################
import pandas as pd
import os
from os import listdir
from os.path import isfile, join
import glob
import datetime
import gc
import numpy as np 

# Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity

import bz2 
import pickle

from Python_Class_LLDA import LLDA

# Multiprocessing
from multiprocessing.dummy import Pool as ThreadPool
import multiprocessing


for journal_censoring in ["Since_1980"]:

	###################################
	##### Read in Filename to NLLDA
	###################################

	# File names embed the censoring window and the discipline ID; the corpus
	# variant picks between the "EnglishOnly" and the "All" files (any value
	# other than "english_only" selects "All").
	censoring_tag = str(journal_censoring)
	discipline_tag = str(discipline)
	if language == "english_only":
		NLLDA_Filename = "OUTPUT_Python_MAG_Journal_Censored_"+censoring_tag+"_Yearly_NLLDA_Dict_Corpus_RAKE_and_GoogleAPI_EnglishOnly_"+discipline_tag+".pbz2"
		TopicCoherence_Filename = "OUTPUT_Python_MAG_Journal_Censored_"+censoring_tag+"_Yearly_NLLDA_Topic_Coherence_Scores_EnglishOnly_"+discipline_tag+".pbz2"
	else:
		NLLDA_Filename = "OUTPUT_Python_MAG_Journal_Censored_"+censoring_tag+"_Yearly_NLLDA_Dict_Corpus_RAKE_and_GoogleAPI_All_"+discipline_tag+".pbz2"
		TopicCoherence_Filename = "OUTPUT_Python_MAG_Journal_Censored_"+censoring_tag+"_Yearly_NLLDA_Topic_Coherence_Scores_All_"+discipline_tag+".pbz2"

	# Skip when the scores were already written, or when the fitted models
	# they are computed from are not on disk.
	if os.path.exists(TopicCoherence_Filename) or not os.path.exists(NLLDA_Filename):
		continue

	print("Trying NLLDA Topic Coherence: "+str(discipline))

	###################################
	##### Load in LLDA Dictionary | Beta = 0.1
	###################################
	NLLDA_Min_Year = 1980 #The Earliest Year of the LLDA
	NLLDA_Max_Year = 2017 #The Last Year

	# What It Does: Read in the NLLDA file, a bz2-compressed stream of pickled
	# objects (presumably one fitted LLDA model per year), loading one object
	# per year in chronological order, starting at the beginning of the file.
	# "Betas" are NLLDA hyperparameters.
	# Here, we're only considering Beta = 0.1
	# NOTE: pickle.load can execute arbitrary code; only feed this script files
	# written by the earlier steps of this pipeline.
	beta = 0.1
	Dictionary_of_NLLDA = {}
	# The context manager closes the input once loading is done, even if it
	# fails part-way; previously the handle stayed open for the rest of the run.
	with bz2.BZ2File(NLLDA_Filename, 'rb') as f:
		for year in range(NLLDA_Min_Year,NLLDA_Max_Year+1):
			print("beta "+str(beta)+" year "+str(year))
			try:
				Dictionary_of_NLLDA[year] = pickle.load(f)
			except Exception as load_error:
				# Best effort: a year that cannot be read (e.g. the stream ended
				# early) keeps an empty placeholder so later steps can skip it.
				# KeyboardInterrupt/SystemExit are no longer swallowed.
				print("Could not load NLLDA model for year "+str(year)+": "+repr(load_error))
				Dictionary_of_NLLDA[year] = {}

	###################################
	##### Create Phi DataFrames for Each Year
	###################################
	'''
	What It Does: Extract Phi Matrices, Country Labels (columns), and Vocas IDs (indices) into Dictionaries
	'''
	phi_normed_dictionary = {}
	phi_raw_dictionary = {}

	for year_ in range(NLLDA_Min_Year,NLLDA_Max_Year+1):
		print("Phi Year: "+str(year_))
		try:
			# Extract necessary parts from NLLDA Dictionary you loaded in.
			# A year whose model failed to load holds an empty dict, so these
			# attribute lookups raise and the year falls through to the
			# empty-frame fallback below.
			model_ = Dictionary_of_NLLDA[year_]
			phi_ = model_.phi()
			labelmap_ = model_.labelmap
			vocas_ = model_.vocas_id

			# Swap label and vocas dictionaries so that they are id number first
			# E.g., US:3 becomes 3:US, where 3 is the fourth column of the transposed Phi
			# E.g., Social Capital:34 becomes 34:Social Capital, the 35th row of the transposed Phi
			labelmap_ = {v:k for k, v in labelmap_.items()}
			vocas_ = {v:k for k, v in vocas_.items()}

			### Create Phi DF where columns have country labels and rows have the vocabulary.
			### Raw    | each column's lowest value is replaced with zero.
			### Normed | the raw columns are then rescaled so that each sums to 1.
			### *********************************
			### Sanity Check | Compare column by column Term-to-Label presence matrix
			### NLLDA[year_].n_z_t with normalized phi_df_normed[label][phi_df_normed[label]>0]
			### Once the lowest value is set to 0, the lengths should match.
			### Indicating that lowest value in non-normed phi_df reflects non-present terms.
			# Raw
			vocas_df_ = pd.DataFrame.from_dict(vocas_,orient='index')
			label_term_df_ = pd.DataFrame(phi_.T).rename(columns=labelmap_)
			phi_df_raw = pd.merge(vocas_df_,label_term_df_,left_index=True,right_index=True).set_index(0)
			phi_df_raw = phi_df_raw.apply(lambda x: x.replace(np.min(x),0),axis=0)
			# Normed
			phi_df_normed = phi_df_raw.apply(lambda x: x/x.sum(),axis=0)

			# Collect and Save to a Dictionary
			phi_normed_dictionary[year_] = phi_df_normed
			phi_raw_dictionary[year_] = phi_df_raw

			# Reclaim the intermediate objects built for this year before
			# moving on to the next one.
			gc.collect()

		except Exception as phi_error:
			# Best effort: a year without a usable model gets empty frames, but
			# the reason is reported instead of being silently discarded, and
			# KeyboardInterrupt/SystemExit are no longer swallowed.
			print("Phi Year "+str(year_)+" skipped: "+repr(phi_error))
			phi_normed_dictionary[year_] = pd.DataFrame()
			phi_raw_dictionary[year_] = pd.DataFrame()

	###################################
	##### Topic Coherence
	###################################

	# Type UMASS Mimno 2011
	# Source | https://www.aclweb.org/anthology/D11-1024.pdf
	cutoff_top_terms = 25

	# For every year and every label (column of that year's raw Phi frame),
	# take the `cutoff_top_terms` highest-weighted terms and translate them
	# with the model's term_to_id so they can be looked up in the corpus.
	top_terms_Dict = {}
	for year_ in phi_normed_dictionary.keys():
		raw_phi_ = phi_raw_dictionary[year_]
		model_ = Dictionary_of_NLLDA[year_]
		top_terms_Dict[year_] = {
			label_: [
				model_.term_to_id(top_term_)
				for top_term_ in raw_phi_.nlargest(cutoff_top_terms,label_).index.tolist()
			]
			for label_ in raw_phi_.columns.tolist()
		}

	def DocIDTermInCorpus(term,input_corpus):
		"""Return the positions of the documents in ``input_corpus`` that contain ``term``."""
		return [doc_id for doc_id, doc in enumerate(input_corpus) if term in doc]

	def CalculateTopicCoherence(list_top_terms,input_corpus):
		"""Return the summed log co-document-frequency score of a topic's top terms.

		For every ordered pair of distinct terms ``(term_, alter_term)`` in
		``list_top_terms`` the score adds
		``log((D(term_, alter_term) + 1) / D(term_))``, where ``D(term_)`` is the
		number of documents containing ``term_`` and ``D(term_, alter_term)`` the
		number containing both.

		Terms that occur in no document are skipped as the reference term: their
		ratio is undefined, and dividing by their zero document count used to
		raise ZeroDivisionError before the finiteness filter could drop them.

		Returns 0 when no pair contributes (e.g. an empty term list).
		"""
		# Build each term's document set once instead of once per term pair.
		doc_sets = {term_:set(DocIDTermInCorpus(term_,input_corpus)) for term_ in list_top_terms}

		coherence_scores_ = []
		for term_ in list_top_terms:
			term_docs_ = doc_sets[term_]
			if not term_docs_:
				continue
			for alter_term in list_top_terms:
				if term_!=alter_term:
					co_doc_count_ = len(term_docs_ & doc_sets[alter_term])
					# Numerator and denominator are both >= 1, so the log is finite.
					coherence_scores_.append(np.log(float(co_doc_count_+1)/len(term_docs_)))

		return sum(coherence_scores_)

	# Score every (year, label) pair against that year's documents.
	# The model's `.docs` is read inside the label loop on purpose: years with
	# no labels never touch it.
	topic_coherence_Dict = {}
	for year_, terms_by_label_ in top_terms_Dict.items():
		yearly_scores_ = {}
		for label_, label_terms_ in terms_by_label_.items():
			yearly_scores_[label_] = CalculateTopicCoherence(label_terms_,Dictionary_of_NLLDA[year_].docs)
		topic_coherence_Dict[year_] = yearly_scores_


	#################
	#### Output | Topic Coherence Scores
	#################

	# Write the scores to a temporary sibling file and rename it into place only
	# after the dump succeeded. The script skips a discipline whenever the output
	# file already exists, so a truncated file left behind by an interrupted
	# write would otherwise be mistaken for a finished result.
	partial_output_filename = TopicCoherence_Filename+".tmp"
	gc.disable() # Memory Issues When Pickling
	try:
		with bz2.BZ2File(partial_output_filename, 'w') as f:
			pickle.dump(topic_coherence_Dict, f, protocol=-1)
		os.replace(partial_output_filename, TopicCoherence_Filename)
	finally:
		# Re-enable the collector even when the dump fails.
		gc.enable()